root_dir <- "/g/huber/users/fridljand/R/ihw-forest-paper/"
load(file.path(root_dir, "data/hqtl_chrom1_chrom2/snps1.filtered.allGT.txt.rda"))
snps1 <- snps
load(file.path(root_dir, "data/hqtl_chrom1_chrom2/snps2.filtered.allGT.txt.rda"))
snps2 <- snps
chr1_maf_download <- readRDS(file = file.path(root_dir, "data/downloaded_covariates/chr1_maf.Rds"))
chr2_maf_download <- readRDS(file = file.path(root_dir, "data/downloaded_covariates/chr2_maf.Rds"))
unique(unlist(apply(snps1[[1]], 1, unique)))
## [1] 0 1 2
#http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/faq.html
maf.list = vector('list', length(snps1))
for(sl in 1:length(snps1)) {
slice = snps1[[sl]];
maf.list[[sl]] = rowMeans(slice,na.rm=TRUE)/2;
maf.list[[sl]] = pmin(maf.list[[sl]],1-maf.list[[sl]]);
}
chr1_maf_sample <- data.frame(
refsnp_id = MatrixEQTL::rownames(snps1),
minor_allele_freq = unlist(maf.list)
)
hist(chr1_maf_sample$minor_allele_freq)
axis(1, at = seq(0.05, 1, by = 0.1), las=2)
saveRDS(chr1_maf_sample, file = file.path(root_dir, "data/hqtl_chrom1_chrom2/chr1_maf_sample.rds"))
#http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/faq.html
maf.list2 = vector('list', length(snps2))
for(sl in 1:length(snps2)) {
slice = snps2[[sl]];
maf.list2[[sl]] = rowMeans(slice,na.rm=TRUE)/2;
maf.list2[[sl]] = pmin(maf.list2[[sl]],1-maf.list2[[sl]]);
}
chr2_maf_sample <- data.frame(
refsnp_id = MatrixEQTL::rownames(snps2),
minor_allele_freq = unlist(maf.list2)
)
hist(chr2_maf_sample$minor_allele_freq)
saveRDS(chr2_maf_sample, file = file.path(root_dir, "data/hqtl_chrom1_chrom2/chr2_maf_sample.rds"))
chr1_maf_sample_download <- dplyr::inner_join(chr1_maf_sample %>% rename(minor_allele_freq_sample = minor_allele_freq),
chr1_maf_download %>% rename(minor_allele_freq_download = minor_allele_freq),
by = "refsnp_id")
chr1_maf_sample_download <- chr1_maf_sample_download %>% drop_na(minor_allele_freq_download)
chr1_maf_sample_download <- chr1_maf_sample_download[sample(nrow(chr1_maf_sample_download), 3000), ]
#cor(chr1_maf_sample_download$minor_allele_freq_sample,chr1_maf_sample_download$minor_allele_freq_download)
plot(chr1_maf_sample_download$minor_allele_freq_sample,
chr1_maf_sample_download$minor_allele_freq_download,
col = alpha("black", 0.3),
pch=16)
abline(coef = c(0,1))
Does not look very correlated.
sl = 1
sample_size = ncol(snps1)-1 #TODO -1?
maf_list_slice = chr1_maf_sample$minor_allele_freq[1:2000]
snps1_slice = snps1[[sl]];
p = maf_list_slice
q = 1-p # Calulate frequency of minor allele being present in homozygous and heterozygous state
f_dom_hom = q^2
f_hom = p^2
f_het = 2*p*q # Expected number of observations in a sample size of 300
expected_f_hom_het <- data.frame(
expected_0 = round(f_dom_hom * sample_size),
expected_1=round(f_het * sample_size),
expected_2 =round(f_hom * sample_size)
)
observed_f_hom_het <- apply(snps1_slice, 1, table)
observed_f_hom_het <- observed_f_hom_het %>%
lapply(function (df){
df <- as.data.frame(df)
pivot_wider(df, names_from = "Var1", values_from = "Freq")
})
observed_f_hom_het <- observed_f_hom_het %>%
data.table::rbindlist(use.names=TRUE, fill=TRUE)
observed_f_hom_het[is.na(observed_f_hom_het)] <- 0
expected_observed_f_hom_het <- cbind(expected_f_hom_het, observed_f_hom_het)
ggplot(expected_observed_f_hom_het, aes(x = expected_0, y = `0`))+
geom_point() +
geom_abline(slope = 1)
ggplot(expected_observed_f_hom_het, aes(x = expected_1, y = `1`))+
geom_point() +
geom_abline(slope = 1)
ggplot(expected_observed_f_hom_het, aes(x = expected_2, y = `2`))+
geom_point() +
geom_abline(slope = 1)